# Install the data using seurat-disk below if data("panc8") does not exist,
# InstallData("panc8")
data("panc8")
pancreas.list <- SplitObject(panc8, split.by = "tech")
pancreas.list <-
pancreas.list[c("celseq", "celseq2", "fluidigmc1", "smartseq2")]
# normally, we use Read10X or read.table or read.csv
#dat.1 <- Read10X(data.dir = "")
#dat.2 <- Read10X(data.dir = "")
#dat.1.obj <- CreateSeuratObject(counts = dat.1, project = "stim", min.cells = 3, min.features = 200)
#dat.2.obj <- CreateSeuratObject(counts = dat.2, project = "ctrl", min.cells = 3, min.features = 200)
#Give a group name ("treatment") and sample labels (stil and ctrl) to both data
#dat.1.obj$treatment="stim"
#dat.2.obj$treatment="ctrl"
for (i in 1:length(pancreas.list)) {
pancreas.list[[i]] <-
NormalizeData(pancreas.list[[i]], verbose = FALSE)
pancreas.list[[i]] <-
FindVariableFeatures(
pancreas.list[[i]],
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE
)
}
reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")]
# Find integration anchors and integrate data
pancreas.anchors <-
FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
pancreas.integrated <-
IntegrateData(anchorset = pancreas.anchors, dims = 1:30)
# switch to integrated assay.
DefaultAssay(pancreas.integrated) <- "integrated"
# Run the standard workflow for visualization and clustering
pancreas.integrated <-
ScaleData(pancreas.integrated, verbose = FALSE)
pancreas.integrated <-
RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE)
pancreas.integrated <-
RunUMAP(
pancreas.integrated,
reduction = "pca",
dims = 1:30,
verbose = FALSE
)
# Number of cells in each condition
table(pancreas.integrated$tech)
##
## celseq celseq2 smartseq2
## 1004 2285 2394
table(pancreas.integrated@meta.data$tech,
pancreas.integrated@meta.data$celltype)
##
## acinar activated_stellate alpha beta delta ductal endothelial
## celseq 229 19 213 161 50 304 5
## celseq2 274 90 844 445 203 257 21
## smartseq2 188 55 1008 308 127 444 21
##
## epsilon gamma macrophage mast quiescent_stellate schwann
## celseq 1 18 1 1 1 1
## celseq2 4 110 15 6 12 4
## smartseq2 8 213 7 7 6 2
# Visualization
p1 <-
DimPlot(pancreas.integrated,
reduction = "umap",
group.by = "tech")
p2 <-
DimPlot(
pancreas.integrated,
reduction = "umap",
group.by = "celltype",
label = TRUE,
repel = TRUE
) + NoLegend()
p1 + p2
p3 <-
DimPlot(pancreas.integrated,
reduction = "umap",
group.by = "celltype")
LabelClusters(
p3,
id = "celltype",
color = unique(ggplot_build(p3)$data[[1]]$colour),
size = 5,
repel = T,
box.padding = 1.25
)
DimPlot(
pancreas.integrated,
reduction = "umap",
split.by = "tech",
label = TRUE
)
# Remember to switch to raw data for DEG
DefaultAssay(pancreas.integrated) <- "RNA"
# Find DEGs in celseq
celseq.markers <-
FindMarkers(
pancreas.integrated,
ident.1 = "celseq",
group.by = "tech",
logfc.threshold = 0.25,
only.pos = TRUE
)
# Find DEGs in alpha
Idents(pancreas.integrated) <- "celltype"
alpha.markers <-
FindMarkers(
pancreas.integrated,
ident.1 = "acinar",
logfc.threshold = 0.25,
only.pos = TRUE
)
# Find conserved DEGs among techs
conserve.markers <-
FindConservedMarkers(pancreas.integrated,
ident.1 = c("acinar"),
grouping.var = "tech")
# Find DEGs for all techs
Idents(pancreas.integrated) <- "tech"
all.markers <-
FindAllMarkers(pancreas.integrated,
logfc.threshold = 0.25,
only.pos = TRUE)
top_10_marker <-
all.markers %>% group_by(cluster) %>% top_n(n = 10, avg_log2FC)
head(top_10_marker)
## # A tibble: 6 x 7
## # Groups: cluster [1]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 5.60 0.581 0.006 0 celseq INS-IGF2
## 2 0 3.70 1 0.488 0 celseq ERCC-00002
## 3 0 3.39 1 0.488 0 celseq ERCC-00096
## 4 0 3.10 0.994 0.329 0 celseq ERCC-00136
## 5 0 3.07 1 0.488 0 celseq ERCC-00046
## 6 0 3.05 0.995 0.848 0 celseq RPS18
# Draw heatmap
DoHeatmap(
pancreas.integrated,
features = top_10_marker$gene,
slot = "counts",
size = 4
) +
scale_fill_gradientn(colors = RColorBrewer::brewer.pal(n = 9, name = "RdBu"))
VlnPlot(pancreas.integrated, features = c("REG1A"))
VlnPlot(pancreas.integrated,
features = c("REG1A"),
group.by = "celltype")
VlnPlot(
pancreas.integrated,
features = c("REG1A"),
group.by = "celltype",
split.by = "tech"
)
pancreas.integrated.sub <-
subset(pancreas.integrated, idents = c("celseq", "celseq2"))
VlnPlot(
pancreas.integrated.sub,
features = c("REG1A"),
group.by = "celltype",
split.by = "tech",
cols = c("red", "grey", "blue"),
pt.size = 0
)
FeaturePlot(
pancreas.integrated,
features = c("REG1A"),
split.by = "tech",
max.cutoff = 3,
cols = c("grey", "red")
)
Seurat also supports the projection of reference data (or meta data) onto a query object. While many of the methods are conserved (both procedures begin by identifying anchors), there are two important distinctions between data transfer and integration:
In data transfer, Seurat does not correct or modify the query expression data. In data transfer, Seurat has an option (set by default) to project the PCA structure of a reference onto the query, instead of learning a joint structure with CCA. We generally suggest using this option when projecting data between scRNA-seq datasets.
# Use the integrated assay
DefaultAssay(pancreas.integrated) <- "integrated"
# setup the reference (query) object list (optional)
pancreas.query <- pancreas.list[["fluidigmc1"]]
# Find anchors for transfer
pancreas.anchors <-
FindTransferAnchors(
reference = pancreas.integrated,
query = pancreas.query,
dims = 1:30,
reference.reduction = "pca"
)
predictions <-
TransferData(anchorset = pancreas.anchors,
refdata = pancreas.integrated$celltype,
dims = 1:30)
pancreas.query <-
AddMetaData(pancreas.query, metadata = predictions)
pancreas.query$prediction.match <-
pancreas.query$predicted.id == pancreas.query$celltype
table(pancreas.query$prediction.match)
##
## FALSE TRUE
## 21 617
pancreas.integrated <-
RunUMAP(
pancreas.integrated,
dims = 1:30,
reduction = "pca",
return.model = TRUE
)
pancreas.query <-
MapQuery(
anchorset = pancreas.anchors,
reference = pancreas.integrated,
query = pancreas.query,
refdata = list(celltype = "celltype"),
reference.reduction = "pca",
reduction.model = "umap"
)
p1 <-
DimPlot(
pancreas.integrated,
reduction = "umap",
group.by = "celltype",
label = TRUE,
label.size = 3,
repel = TRUE
) + NoLegend() + ggtitle("Reference annotations")
p2 <-
DimPlot(
pancreas.query,
reduction = "ref.umap",
group.by = "predicted.celltype",
label = TRUE,
label.size = 3,
repel = TRUE
) + NoLegend() + ggtitle("Query transferred labels")
p1 + p2
VlnPlot(pancreas.query,
features = c("REG1A"),
group.by = "celltype")
pancreas.merge <- merge(pancreas.integrated, pancreas.query)
VlnPlot(
pancreas.merge,
features = c("REG1A"),
group.by = "celltype",
split.by = "tech",
cols = c("red", "grey", "blue", "green"),
pt.size = 0
)
sankey.dat <-
data.frame(
source = pancreas.query$predicted.id,
target = pancreas.query$celltype,
value = rep(1, length(pancreas.query$celltype))
)
sankey.dat$new <- paste(sankey.dat$source, sankey.dat$target)
# create a connecting data frame of label pair frequencies
sankey.link <- aggregate(value ~ new, sankey.dat, sum)
sankey.link <-
separate(
sankey.link ,
col = new,
into = c("source", "target"),
sep = " "
)
sankey.link$target <- paste(sankey.link$target, " ", sep = "")
# create a node data frame of all unique labels
sankey.nodes <- data.frame(name = c(
as.character(sankey.link$source),
as.character(sankey.link$target)
) %>% unique())
# transfer target and source names to node numbers
sankey.link$IDsource <-
match(sankey.link$source, sankey.nodes$name) - 1
sankey.link$IDtarget <-
match(sankey.link$target, sankey.nodes$name) - 1
p <- sankeyNetwork(
Links = sankey.link,
Nodes = sankey.nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "value",
NodeID = "name",
sinksRight = FALSE,
fontSize = 15,
nodeWidth = 40,
nodePadding = 10
)
p
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
##
## Matrix products: default
## BLAS/LAPACK: /opt/intel/19.0.5/compilers_and_libraries_2019.5.281/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] networkD3_0.4 RCurl_1.98-1.4
## [3] cowplot_1.1.1 scales_1.1.1
## [5] Matrix_1.3-4 forcats_0.5.0
## [7] stringr_1.4.0 dplyr_1.0.1
## [9] purrr_0.3.4 readr_1.3.1
## [11] tidyr_1.1.1 tibble_3.0.3
## [13] ggplot2_3.3.5 tidyverse_1.3.0
## [15] SeuratObject_4.0.2 Seurat_4.0.4
## [17] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0 GenomicRanges_1.42.0
## [21] GenomeInfoDb_1.26.7 IRanges_2.24.1
## [23] S4Vectors_0.28.1 BiocGenerics_0.36.1
## [25] MatrixGenerics_1.2.1 matrixStats_0.60.1
## [27] panc8.SeuratData_3.0.2 SeuratData_0.2.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 sn_2.0.0
## [4] plyr_1.8.6 igraph_1.2.6 lazyeval_0.2.2
## [7] splines_4.0.2 listenv_0.8.0 scattermore_0.7
## [10] TH.data_1.1-0 digest_0.6.25 htmltools_0.5.2
## [13] fansi_0.4.1 magrittr_1.5 tensor_1.5
## [16] cluster_2.1.0 ROCR_1.0-11 limma_3.46.0
## [19] globals_0.14.0 modelr_0.1.8 sandwich_3.0-1
## [22] spatstat.sparse_2.0-0 colorspace_1.4-1 rvest_0.3.6
## [25] blob_1.2.1 rappdirs_0.3.1 ggrepel_0.9.1
## [28] rbibutils_2.2.4 haven_2.3.1 xfun_0.16
## [31] crayon_1.3.4 jsonlite_1.7.0 spatstat.data_2.1-0
## [34] survival_3.1-12 zoo_1.8-9 glue_1.4.1
## [37] polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.36.0
## [40] XVector_0.30.0 leiden_0.3.9 DelayedArray_0.16.3
## [43] future.apply_1.8.1 abind_1.4-5 mvtnorm_1.1-1
## [46] DBI_1.1.0 miniUI_0.1.1.1 Rcpp_1.0.7
## [49] plotrix_3.8-1 metap_1.5 viridisLite_0.4.0
## [52] xtable_1.8-4 tmvnsim_1.0-2 reticulate_1.20
## [55] spatstat.core_2.3-0 htmlwidgets_1.5.3 httr_1.4.2
## [58] RColorBrewer_1.1-2 TFisher_0.2.0 ellipsis_0.3.1
## [61] ica_1.0-2 farver_2.0.3 pkgconfig_2.0.3
## [64] uwot_0.1.10 dbplyr_1.4.4 deldir_0.1-28
## [67] utf8_1.1.4 labeling_0.3 tidyselect_1.1.0
## [70] rlang_0.4.11 reshape2_1.4.4 later_1.1.0.1
## [73] cellranger_1.1.0 munsell_0.5.0 tools_4.0.2
## [76] cli_2.0.2 generics_0.0.2 mathjaxr_1.4-0
## [79] broom_0.7.9 ggridges_0.5.3 evaluate_0.14
## [82] fastmap_1.1.0 yaml_2.2.1 goftest_1.2-2
## [85] fs_1.5.0 knitr_1.29 fitdistrplus_1.1-5
## [88] RANN_2.6.1 pbapply_1.4-3 future_1.22.1
## [91] nlme_3.1-148 mime_0.9 xml2_1.3.2
## [94] rstudioapi_0.11 compiler_4.0.2 plotly_4.9.4.1
## [97] png_0.1-7 spatstat.utils_2.2-0 reprex_0.3.0
## [100] stringi_1.4.6 lattice_0.20-41 multtest_2.46.0
## [103] vctrs_0.3.2 mutoss_0.1-12 pillar_1.4.6
## [106] lifecycle_0.2.0 Rdpack_2.1.2 spatstat.geom_2.2-2
## [109] lmtest_0.9-38 RcppAnnoy_0.0.19 data.table_1.14.0
## [112] bitops_1.0-7 irlba_2.3.3 httpuv_1.5.4
## [115] patchwork_1.1.1 R6_2.4.1 promises_1.1.1
## [118] KernSmooth_2.23-17 gridExtra_2.3 parallelly_1.27.0
## [121] codetools_0.2-16 MASS_7.3-51.6 assertthat_0.2.1
## [124] withr_2.2.0 mnormt_2.0.2 sctransform_0.3.2
## [127] multcomp_1.4-17 GenomeInfoDbData_1.2.4 mgcv_1.8-31
## [130] hms_0.5.3 grid_4.0.2 rpart_4.1-15
## [133] rmarkdown_2.3 Rtsne_0.15 numDeriv_2016.8-1.1
## [136] shiny_1.5.0 lubridate_1.7.9